##
## H2O is not running yet, starting it now...
##
## Note: In case of errors look at the following log files:
## /var/folders/qw/2tnkb3b11dncn1d6lmqs7rh40000gn/T//RtmpT5ZBxS/h2o_krishnaprasad_started_from_r.out
## /var/folders/qw/2tnkb3b11dncn1d6lmqs7rh40000gn/T//RtmpT5ZBxS/h2o_krishnaprasad_started_from_r.err
##
##
## Starting H2O JVM and connecting: .. Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 1 seconds 864 milliseconds
## H2O cluster timezone: America/Denver
## H2O data parsing timezone: UTC
## H2O cluster version: 3.28.0.2
## H2O cluster version age: 1 month and 6 days
## H2O cluster name: H2O_started_from_R_krishnaprasad_aty884
## H2O cluster total nodes: 1
## H2O cluster total memory: 4.00 GB
## H2O cluster total cores: 12
## H2O cluster allowed cores: 12
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: Amazon S3, XGBoost, Algos, AutoML, Core V3, TargetEncoder, Core V4
## R Version: R version 3.6.1 (2019-07-05)
# Read Classification dataset from RDS
class.data <- readRDS("KDEN_Class_Data_New.RDS")
# Reorder data frame by Date so downstream week/year features are chronological
class.data <- class.data[order(class.data$DATE), ]
# class.data$PRCP_LAG_1 <- lag(class.data$PRCP, k = 1)
# class.data$TEMP_LAG_1 <- lag(class.data$TEMP, k = 1)
# Drop rows with any missing values
class.data <- class.data[complete.cases(class.data), ]
# Remove sentinel "missing" codes from the weather fields
# (999.9 = missing max wind speed, 99.99 = missing precipitation)
class.data <-
  class.data[!(class.data$MXSPD == 999.9 |
                 class.data$PRCP == 99.99), ] # | class.data$PRCP_LAG_1 == 99.99
class.data$FOG <- as.factor(class.data$FOG)
class.data$SNOW_ICE <- as.factor(class.data$SNOW_ICE)
# Derive calendar features and the strike ratio per 10,000 flights.
# Use bare column names inside mutate() (not class.data$DATE): referencing the
# external data frame bypasses dplyr's data masking and silently breaks if
# rows are filtered or groups are added earlier in the pipeline.
class.data <- class.data %>%
  mutate(
    WEEK = lubridate::week(DATE),
    YEAR = lubridate::year(DATE),
    RATIO = STRIKECOUNT / FLIGHTCOUNT * 10000
  )
# Build a per-(YEAR, WEEK) risk label from the weekly mean strike ratio.
# NOTE: summarise() peels off only the innermost group (WEEK), so the result
# is still grouped by YEAR — the quantile()/.bincode() in the following
# mutate() is therefore computed WITHIN each year, i.e. risk terciles are
# year-relative, not global.
t.data <- class.data %>%
mutate(RATIO = STRIKECOUNT / FLIGHTCOUNT * 10000) %>%
group_by(YEAR, WEEK) %>%
summarise(RATIO = mean(RATIO)) %>%
# .bincode() over the tercile breaks yields bins 1/2/3; subtract 1 -> 0/1/2
mutate(RISK = .bincode(
RATIO,
breaks = quantile(RATIO, probs = seq(0, 1, 1 / 3)),
include.lowest = TRUE
) - 1) %>%
dplyr::select(-RATIO)
# Attach the weekly risk code back onto every daily observation
class.data <-
left_join(class.data, t.data, by = c("YEAR" = "YEAR", "WEEK" = "WEEK"))
# Recode numeric 0/1/2 into Low/Medium/High factor levels
class.data$RISK <-
as.factor(ifelse(class.data$RISK == 0, "L", ifelse(class.data$RISK == 1, "M", "H")))
# One-hot-encode categorical features
ohe_feats <- c("MONTH")
# Create dummy (indicator) columns for MONTH via caret::dummyVars
dummies <- dummyVars(~ MONTH, data = class.data)
df.dummies <- as.data.frame(predict(dummies, newdata = class.data))
# Replace the original MONTH column with its dummy columns.
# Use a logical mask rather than -which(...): if ohe_feats ever matched no
# columns, -integer(0) would silently drop EVERY column of the data frame.
class.data <-
  cbind(class.data[, !(colnames(class.data) %in% ohe_feats)], df.dummies)
# Hold out 2019 as the out-of-time validation set
valid.cl.data <-
  class.data[(class.data$YEAR == 2019), ]
# Keep the remaining modern years for training/testing
# (bare YEAR inside filter() — no need to re-reference class.data$)
class.data <-
  class.data %>% filter(!YEAR %in% c(1995:2007, 2019))
# Drop identifiers, label-leaking columns, and one redundant dummy (MONTH.12)
class.data <- subset(class.data, select = -c(DATE, YEAR, SEASON, MXSPD, SNOW_ICE, STRIKECOUNT, STRIKE, WEEK, RATIO, MONTH.12))
valid.cl.data <- subset(valid.cl.data, select = -c(DATE, YEAR, SEASON, MXSPD, SNOW_ICE, STRIKECOUNT, STRIKE, WEEK, RATIO, MONTH.12))
# Create the training and test datasets
set.seed(100)
class.data$RISK <- as.factor(class.data$RISK) # already a factor upstream; harmless no-op
# Step 1: Get row numbers for the training data (stratified on RISK)
trainRowNumbers.cl <-
  createDataPartition(class.data$RISK, p = 0.70, list = FALSE)
# Step 2: Create the training dataset
train.data <- class.data[trainRowNumbers.cl, ]
# Step 3: Create the test dataset
test.data <- class.data[-trainRowNumbers.cl, ]
# Print the model's resampling summary, MLeval diagnostics (ROC, calibration,
# precision-recall-gain), and a confusion matrix of its predictions on `data`.
#
# model: a caret `train` object (fit with classProbs = TRUE)
# data:  a data frame containing the predictors and a RISK outcome column
# Returns the caret::confusionMatrix object (last expression).
validateAndPrintResult <- function(model, data) {
  # Summarise resampling results
  print(model)
  # Run MLeval on the caret train object
  res <- evalm(model)
  # Inside a function body auto-printing is disabled, so the original bare
  # `res$roc` / `res$cc` / `res$prg` expressions were computed and silently
  # discarded — print() explicitly so the plots actually render.
  print(res$roc)   # ROC curve
  print(res$cc)    # calibration curve
  print(res$prg)   # precision-recall-gain curve
  # Predict on the supplied data and show a preview of the predictions
  predicted.resp <- predict(model, data)
  print(head(predicted.resp))
  # NOTE(review): RISK has levels H/L/M, so positive = 'YES' matches no class;
  # caret ignores `positive` for multiclass outcomes, so it is harmless here,
  # but it should be removed or corrected if this is ever used for 2 classes.
  caret::confusionMatrix(
    reference = as.factor(data$RISK),
    data = predicted.resp,
    mode = 'everything',
    positive = 'YES'
  )
}
# Shared resampling scheme: 7-fold cross-validation, keeping final-model
# predictions and class probabilities so MLeval and the ensemble can use them.
trControl <- trainControl(
  method = "cv",
  number = 7,
  savePredictions = "final",
  # index = createResample(as.factor(train.data$RISK), 7),
  classProbs = TRUE,
  summaryFunction = multiClassSummary
)

# Multinomial model: single grid point, no weight decay
multinom.grid <- expand.grid(decay = 0)

# xgboost: sweep tree depth only; all other hyperparameters held fixed
xgbTreeGrid <- expand.grid(
  nrounds = 500,
  max_depth = seq(2, 8, by = 1),
  eta = 0.1,
  gamma = 0,
  colsample_bytree = 1.0,
  subsample = 1.0,
  min_child_weight = 4
)

# Elastic net at a fixed mixing/penalty point
# (caret accepts the legacy leading "." in tuning-parameter names)
glmnetGridElastic <- expand.grid(.alpha = 0.3, .lambda = 0.009)

# gbm: full 3 x 3 x 3 grid over depth, learning rate, and minimum node size
gbm.tune.grid <- expand.grid(
  n.trees = c(400),
  interaction.depth = c(1, 3, 5),
  shrinkage = c(.01, .1, .3),
  n.minobsinnode = c(5, 10, 15)
)
# Fit all five base learners with caretEnsemble::caretList. Since trControl
# defines no resampling indexes, caretList generates one shared set so every
# model is evaluated on identical folds (required for ensembling).
set.seed(333)
modelList <- caretList(
RISK ~ .,
train.data,
trControl = trControl,
metric = "logLoss", # each model tuned by smallest cross-validated log loss
verbose = TRUE,
tuneList = list(
# Multinomial Logistic regression is using multinom method from nnet package
multinom = caretModelSpec(method = 'multinom',
maxit = 150,
tuneGrid = multinom.grid),
## Do not use custom names in list. Will give prediction error with greedy ensemble. Bug in caret.
xgbTree = caretModelSpec(
method = "xgbTree",
tuneGrid = xgbTreeGrid,
nthread = 8
),
glmnet = caretModelSpec(method = "glmnet", tuneGrid = glmnetGridElastic),
# Elastic, highly correlated with lasso and ridge regressions
rf = caretModelSpec(
method = "rf",
ntree = 2000,
tuneLength = 20,
tuneGrid = data.frame(mtry = 10)
),
# rf: tuneGrid pins mtry = 10, so tuneLength = 20 is effectively unused
gbm = caretModelSpec(method = "gbm", tuneGrid = gbm.tune.grid)
)
)
## Warning in trControlCheck(x = trControl, y = target): indexes not defined in
## trControl. Attempting to set them ourselves, so each model in the ensemble will
## have the same resampling indexes.
## # weights: 63 (40 variable)
## initial value 3091.494980
## iter 10 value 2610.059596
## iter 20 value 2240.432174
## iter 30 value 2120.343168
## iter 40 value 2102.297073
## iter 50 value 2101.755794
## final value 2101.755759
## converged
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0986 nan 0.0100 0.0154
## 2 1.0902 nan 0.0100 0.0149
## 3 1.0822 nan 0.0100 0.0152
## 4 1.0739 nan 0.0100 0.0144
## 5 1.0660 nan 0.0100 0.0136
## 6 1.0586 nan 0.0100 0.0130
## 7 1.0514 nan 0.0100 0.0131
## 8 1.0443 nan 0.0100 0.0122
## 9 1.0374 nan 0.0100 0.0118
## 10 1.0307 nan 0.0100 0.0119
## 20 0.9728 nan 0.0100 0.0092
## 40 0.8949 nan 0.0100 0.0049
## 60 0.8465 nan 0.0100 0.0031
## 80 0.8146 nan 0.0100 0.0019
## 100 0.7926 nan 0.0100 0.0009
## 120 0.7762 nan 0.0100 0.0005
## 140 0.7634 nan 0.0100 0.0005
## 160 0.7529 nan 0.0100 0.0005
## 180 0.7436 nan 0.0100 0.0002
## 200 0.7360 nan 0.0100 -0.0000
## 220 0.7287 nan 0.0100 0.0001
## 240 0.7221 nan 0.0100 0.0001
## 260 0.7158 nan 0.0100 0.0001
## 280 0.7106 nan 0.0100 -0.0001
## 300 0.7054 nan 0.0100 -0.0001
## 320 0.7007 nan 0.0100 -0.0001
## 340 0.6962 nan 0.0100 -0.0001
## 360 0.6919 nan 0.0100 -0.0003
## 380 0.6875 nan 0.0100 -0.0001
## 400 0.6832 nan 0.0100 -0.0001
# Evaluate the multinomial logistic model on the held-out test split
validateAndPrintResult(modelList$multinom, test.data)
## Penalized Multinomial Regression
##
## 2814 samples
## 19 predictor
## 3 classes: 'H', 'L', 'M'
##
## No pre-processing
## Resampling: Cross-Validated (7 fold)
## Summary of sample sizes: 2412, 2412, 2412, 2411, 2413, 2412, ...
## Resampling results:
##
## logLoss AUC prAUC Accuracy Kappa Mean_F1
## 0.7664346 0.8053847 0.6387627 0.6400277 0.4590557 0.6267216
## Mean_Sensitivity Mean_Specificity Mean_Pos_Pred_Value Mean_Neg_Pred_Value
## 0.6350695 0.8204098 0.627725 0.8257711
## Mean_Precision Mean_Recall Mean_Detection_Rate Mean_Balanced_Accuracy
## 0.627725 0.6350695 0.2133426 0.7277396
##
## Tuning parameter 'decay' was held constant at a value of 0
## ***MLeval: Machine Learning Model Evaluation***
## Input: caret train function object
## Not averaging probs.
## Group 1 type: cv
## Observations: 2814
## Number of groups: 1
## Observations per group: 2814
## Positive: L
## Negative: H
## Group: Group 1
## Positive: 947
## Negative: 966
## ***Performance Metrics***



## Group 1 Optimal Informedness = 0.578003211449456
## Group 1 AUC-ROC = 0.85

## Confusion Matrix and Statistics
##
## Reference
## Prediction H L M
## H 310 10 81
## L 22 317 151
## M 81 78 153
##
## Overall Statistics
##
## Accuracy : 0.6484
## 95% CI : (0.6206, 0.6754)
## No Information Rate : 0.3433
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4716
##
## Mcnemar's Test P-Value : 4.058e-06
##
## Statistics by Class:
##
## Class: H Class: L Class: M
## Sensitivity 0.7506 0.7827 0.3974
## Specificity 0.8848 0.7832 0.8056
## Pos Pred Value 0.7731 0.6469 0.4904
## Neg Pred Value 0.8716 0.8766 0.7396
## Precision 0.7731 0.6469 0.4904
## Recall 0.7506 0.7827 0.3974
## F1 0.7617 0.7084 0.4390
## Prevalence 0.3433 0.3367 0.3200
## Detection Rate 0.2577 0.2635 0.1272
## Detection Prevalence 0.3333 0.4073 0.2594
## Balanced Accuracy 0.8177 0.7830 0.6015
# Evaluate the multinomial logistic model on the 2019 validation set
validateAndPrintResult(modelList$multinom, valid.cl.data)
## Penalized Multinomial Regression
##
## 2814 samples
## 19 predictor
## 3 classes: 'H', 'L', 'M'
##
## No pre-processing
## Resampling: Cross-Validated (7 fold)
## Summary of sample sizes: 2412, 2412, 2412, 2411, 2413, 2412, ...
## Resampling results:
##
## logLoss AUC prAUC Accuracy Kappa Mean_F1
## 0.7664346 0.8053847 0.6387627 0.6400277 0.4590557 0.6267216
## Mean_Sensitivity Mean_Specificity Mean_Pos_Pred_Value Mean_Neg_Pred_Value
## 0.6350695 0.8204098 0.627725 0.8257711
## Mean_Precision Mean_Recall Mean_Detection_Rate Mean_Balanced_Accuracy
## 0.627725 0.6350695 0.2133426 0.7277396
##
## Tuning parameter 'decay' was held constant at a value of 0
## ***MLeval: Machine Learning Model Evaluation***
## Input: caret train function object
## Not averaging probs.
## Group 1 type: cv
## Observations: 2814
## Number of groups: 1
## Observations per group: 2814
## Positive: L
## Negative: H
## Group: Group 1
## Positive: 947
## Negative: 966
## ***Performance Metrics***



## Group 1 Optimal Informedness = 0.578003211449456
## Group 1 AUC-ROC = 0.85

## Confusion Matrix and Statistics
##
## Reference
## Prediction H L M
## H 79 21 21
## L 0 48 35
## M 25 32 42
##
## Overall Statistics
##
## Accuracy : 0.5578
## 95% CI : (0.4999, 0.6145)
## No Information Rate : 0.3432
## P-Value [Acc > NIR] : 1.869e-14
##
## Kappa : 0.3359
##
## Mcnemar's Test P-Value : 8.359e-05
##
## Statistics by Class:
##
## Class: H Class: L Class: M
## Sensitivity 0.7596 0.4752 0.4286
## Specificity 0.7889 0.8267 0.7220
## Pos Pred Value 0.6529 0.5783 0.4242
## Neg Pred Value 0.8626 0.7591 0.7255
## Precision 0.6529 0.5783 0.4242
## Recall 0.7596 0.4752 0.4286
## F1 0.7022 0.5217 0.4264
## Prevalence 0.3432 0.3333 0.3234
## Detection Rate 0.2607 0.1584 0.1386
## Detection Prevalence 0.3993 0.2739 0.3267
## Balanced Accuracy 0.7743 0.6510 0.5753
# Evaluate the xgboost model on the held-out test split
validateAndPrintResult(modelList$xgbTree, test.data)
## eXtreme Gradient Boosting
##
## 2814 samples
## 19 predictor
## 3 classes: 'H', 'L', 'M'
##
## No pre-processing
## Resampling: Cross-Validated (7 fold)
## Summary of sample sizes: 2412, 2412, 2412, 2411, 2413, 2412, ...
## Resampling results across tuning parameters:
##
## max_depth logLoss AUC prAUC Accuracy Kappa Mean_F1
## 2 0.7735971 0.8083205 0.6459289 0.6336302 0.4492462 0.6193716
## 3 0.8012811 0.8046506 0.6458706 0.6265220 0.4388260 0.6149632
## 4 0.8368296 0.7993913 0.6425343 0.6229674 0.4337348 0.6151014
## 5 0.8741677 0.7973666 0.6402046 0.6155065 0.4225058 0.6075197
## 6 0.9120783 0.7959673 0.6368211 0.6066276 0.4091690 0.5986975
## 7 0.9360382 0.7962670 0.6378808 0.6105331 0.4151823 0.6038741
## 8 0.9674818 0.7951167 0.6373674 0.6084001 0.4118915 0.6010594
## Mean_Sensitivity Mean_Specificity Mean_Pos_Pred_Value Mean_Neg_Pred_Value
## 0.6287187 0.8170270 0.6210115 0.8230597
## 0.6219906 0.8136201 0.6153662 0.8179100
## 0.6190170 0.8119166 0.6154902 0.8144229
## 0.6113308 0.8082161 0.6069299 0.8105530
## 0.6024505 0.8037839 0.5976849 0.8059701
## 0.6065675 0.8058002 0.6036407 0.8074718
## 0.6043163 0.8046929 0.6004833 0.8066484
## Mean_Precision Mean_Recall Mean_Detection_Rate Mean_Balanced_Accuracy
## 0.6210115 0.6287187 0.2112101 0.7228728
## 0.6153662 0.6219906 0.2088407 0.7178054
## 0.6154902 0.6190170 0.2076558 0.7154668
## 0.6069299 0.6113308 0.2051688 0.7097735
## 0.5976849 0.6024505 0.2022092 0.7031172
## 0.6036407 0.6065675 0.2035110 0.7061838
## 0.6004833 0.6043163 0.2028000 0.7045046
##
## Tuning parameter 'nrounds' was held constant at a value of 500
## Tuning
## parameter 'min_child_weight' was held constant at a value of 4
##
## Tuning parameter 'subsample' was held constant at a value of 1
## logLoss was used to select the optimal model using the smallest value.
## The final values used for the model were nrounds = 500, max_depth = 2, eta
## = 0.1, gamma = 0, colsample_bytree = 1, min_child_weight = 4 and subsample = 1.
## ***MLeval: Machine Learning Model Evaluation***
## Input: caret train function object
## Not averaging probs.
## Group 1 type: cv
## Observations: 2814
## Number of groups: 1
## Observations per group: 2814
## Positive: L
## Negative: H
## Group: Group 1
## Positive: 947
## Negative: 966
## ***Performance Metrics***



## Group 1 Optimal Informedness = 0.579887209008348
## Group 1 AUC-ROC = 0.85

## Confusion Matrix and Statistics
##
## Reference
## Prediction H L M
## H 295 16 69
## L 33 313 160
## M 85 76 156
##
## Overall Statistics
##
## Accuracy : 0.6351
## 95% CI : (0.6072, 0.6623)
## No Information Rate : 0.3433
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4518
##
## Mcnemar's Test P-Value : 3.68e-08
##
## Statistics by Class:
##
## Class: H Class: L Class: M
## Sensitivity 0.7143 0.7728 0.4052
## Specificity 0.8924 0.7581 0.8032
## Pos Pred Value 0.7763 0.6186 0.4921
## Neg Pred Value 0.8566 0.8680 0.7415
## Precision 0.7763 0.6186 0.4921
## Recall 0.7143 0.7728 0.4052
## F1 0.7440 0.6872 0.4444
## Prevalence 0.3433 0.3367 0.3200
## Detection Rate 0.2452 0.2602 0.1297
## Detection Prevalence 0.3159 0.4206 0.2635
## Balanced Accuracy 0.8033 0.7655 0.6042
# Evaluate the xgboost model on the 2019 validation set
validateAndPrintResult(modelList$xgbTree, valid.cl.data)
## eXtreme Gradient Boosting
##
## 2814 samples
## 19 predictor
## 3 classes: 'H', 'L', 'M'
##
## No pre-processing
## Resampling: Cross-Validated (7 fold)
## Summary of sample sizes: 2412, 2412, 2412, 2411, 2413, 2412, ...
## Resampling results across tuning parameters:
##
## max_depth logLoss AUC prAUC Accuracy Kappa Mean_F1
## 2 0.7735971 0.8083205 0.6459289 0.6336302 0.4492462 0.6193716
## 3 0.8012811 0.8046506 0.6458706 0.6265220 0.4388260 0.6149632
## 4 0.8368296 0.7993913 0.6425343 0.6229674 0.4337348 0.6151014
## 5 0.8741677 0.7973666 0.6402046 0.6155065 0.4225058 0.6075197
## 6 0.9120783 0.7959673 0.6368211 0.6066276 0.4091690 0.5986975
## 7 0.9360382 0.7962670 0.6378808 0.6105331 0.4151823 0.6038741
## 8 0.9674818 0.7951167 0.6373674 0.6084001 0.4118915 0.6010594
## Mean_Sensitivity Mean_Specificity Mean_Pos_Pred_Value Mean_Neg_Pred_Value
## 0.6287187 0.8170270 0.6210115 0.8230597
## 0.6219906 0.8136201 0.6153662 0.8179100
## 0.6190170 0.8119166 0.6154902 0.8144229
## 0.6113308 0.8082161 0.6069299 0.8105530
## 0.6024505 0.8037839 0.5976849 0.8059701
## 0.6065675 0.8058002 0.6036407 0.8074718
## 0.6043163 0.8046929 0.6004833 0.8066484
## Mean_Precision Mean_Recall Mean_Detection_Rate Mean_Balanced_Accuracy
## 0.6210115 0.6287187 0.2112101 0.7228728
## 0.6153662 0.6219906 0.2088407 0.7178054
## 0.6154902 0.6190170 0.2076558 0.7154668
## 0.6069299 0.6113308 0.2051688 0.7097735
## 0.5976849 0.6024505 0.2022092 0.7031172
## 0.6036407 0.6065675 0.2035110 0.7061838
## 0.6004833 0.6043163 0.2028000 0.7045046
##
## Tuning parameter 'nrounds' was held constant at a value of 500
## Tuning
## parameter 'min_child_weight' was held constant at a value of 4
##
## Tuning parameter 'subsample' was held constant at a value of 1
## logLoss was used to select the optimal model using the smallest value.
## The final values used for the model were nrounds = 500, max_depth = 2, eta
## = 0.1, gamma = 0, colsample_bytree = 1, min_child_weight = 4 and subsample = 1.
## ***MLeval: Machine Learning Model Evaluation***
## Input: caret train function object
## Not averaging probs.
## Group 1 type: cv
## Observations: 2814
## Number of groups: 1
## Observations per group: 2814
## Positive: L
## Negative: H
## Group: Group 1
## Positive: 947
## Negative: 966
## ***Performance Metrics***



## Group 1 Optimal Informedness = 0.579887209008348
## Group 1 AUC-ROC = 0.85

## Confusion Matrix and Statistics
##
## Reference
## Prediction H L M
## H 99 39 24
## L 0 45 35
## M 5 17 39
##
## Overall Statistics
##
## Accuracy : 0.604
## 95% CI : (0.5464, 0.6594)
## No Information Rate : 0.3432
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.403
##
## Mcnemar's Test P-Value : 1.841e-12
##
## Statistics by Class:
##
## Class: H Class: L Class: M
## Sensitivity 0.9519 0.4455 0.3980
## Specificity 0.6834 0.8267 0.8927
## Pos Pred Value 0.6111 0.5625 0.6393
## Neg Pred Value 0.9645 0.7489 0.7562
## Precision 0.6111 0.5625 0.6393
## Recall 0.9519 0.4455 0.3980
## F1 0.7444 0.4972 0.4906
## Prevalence 0.3432 0.3333 0.3234
## Detection Rate 0.3267 0.1485 0.1287
## Detection Prevalence 0.5347 0.2640 0.2013
## Balanced Accuracy 0.8177 0.6361 0.6453
# Evaluate the elastic-net (glmnet) model on the held-out test split
validateAndPrintResult(modelList$glmnet, test.data)
## glmnet
##
## 2814 samples
## 19 predictor
## 3 classes: 'H', 'L', 'M'
##
## No pre-processing
## Resampling: Cross-Validated (7 fold)
## Summary of sample sizes: 2412, 2412, 2412, 2411, 2413, 2412, ...
## Resampling results:
##
## logLoss AUC prAUC Accuracy Kappa Mean_F1
## 0.7679149 0.8061893 0.636981 0.6410982 0.4607653 0.6292992
## Mean_Sensitivity Mean_Specificity Mean_Pos_Pred_Value Mean_Neg_Pred_Value
## 0.6363986 0.8209763 0.6299618 0.8256112
## Mean_Precision Mean_Recall Mean_Detection_Rate Mean_Balanced_Accuracy
## 0.6299618 0.6363986 0.2136994 0.7286875
##
## Tuning parameter 'alpha' was held constant at a value of 0.3
## Tuning
## parameter 'lambda' was held constant at a value of 0.009
## ***MLeval: Machine Learning Model Evaluation***
## Input: caret train function object
## Not averaging probs.
## Group 1 type: cv
## Observations: 2814
## Number of groups: 1
## Observations per group: 2814
## Positive: L
## Negative: H
## Group: Group 1
## Positive: 947
## Negative: 966
## ***Performance Metrics***



## Group 1 Optimal Informedness = 0.585457190383298
## Group 1 AUC-ROC = 0.85

## Confusion Matrix and Statistics
##
## Reference
## Prediction H L M
## H 305 11 77
## L 23 314 147
## M 85 80 161
##
## Overall Statistics
##
## Accuracy : 0.6484
## 95% CI : (0.6206, 0.6754)
## No Information Rate : 0.3433
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4718
##
## Mcnemar's Test P-Value : 2.055e-05
##
## Statistics by Class:
##
## Class: H Class: L Class: M
## Sensitivity 0.7385 0.7753 0.4182
## Specificity 0.8886 0.7870 0.7983
## Pos Pred Value 0.7761 0.6488 0.4939
## Neg Pred Value 0.8667 0.8734 0.7446
## Precision 0.7761 0.6488 0.4939
## Recall 0.7385 0.7753 0.4182
## F1 0.7568 0.7064 0.4529
## Prevalence 0.3433 0.3367 0.3200
## Detection Rate 0.2535 0.2610 0.1338
## Detection Prevalence 0.3267 0.4023 0.2710
## Balanced Accuracy 0.8136 0.7811 0.6082
# Evaluate the elastic-net (glmnet) model on the 2019 validation set
validateAndPrintResult(modelList$glmnet, valid.cl.data)
## glmnet
##
## 2814 samples
## 19 predictor
## 3 classes: 'H', 'L', 'M'
##
## No pre-processing
## Resampling: Cross-Validated (7 fold)
## Summary of sample sizes: 2412, 2412, 2412, 2411, 2413, 2412, ...
## Resampling results:
##
## logLoss AUC prAUC Accuracy Kappa Mean_F1
## 0.7679149 0.8061893 0.636981 0.6410982 0.4607653 0.6292992
## Mean_Sensitivity Mean_Specificity Mean_Pos_Pred_Value Mean_Neg_Pred_Value
## 0.6363986 0.8209763 0.6299618 0.8256112
## Mean_Precision Mean_Recall Mean_Detection_Rate Mean_Balanced_Accuracy
## 0.6299618 0.6363986 0.2136994 0.7286875
##
## Tuning parameter 'alpha' was held constant at a value of 0.3
## Tuning
## parameter 'lambda' was held constant at a value of 0.009
## ***MLeval: Machine Learning Model Evaluation***
## Input: caret train function object
## Not averaging probs.
## Group 1 type: cv
## Observations: 2814
## Number of groups: 1
## Observations per group: 2814
## Positive: L
## Negative: H
## Group: Group 1
## Positive: 947
## Negative: 966
## ***Performance Metrics***



## Group 1 Optimal Informedness = 0.585457190383298
## Group 1 AUC-ROC = 0.85

## Confusion Matrix and Statistics
##
## Reference
## Prediction H L M
## H 80 21 21
## L 0 52 31
## M 24 28 46
##
## Overall Statistics
##
## Accuracy : 0.5875
## 95% CI : (0.5297, 0.6434)
## No Information Rate : 0.3432
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.3805
##
## Mcnemar's Test P-Value : 8.894e-05
##
## Statistics by Class:
##
## Class: H Class: L Class: M
## Sensitivity 0.7692 0.5149 0.4694
## Specificity 0.7889 0.8465 0.7463
## Pos Pred Value 0.6557 0.6265 0.4694
## Neg Pred Value 0.8674 0.7773 0.7463
## Precision 0.6557 0.6265 0.4694
## Recall 0.7692 0.5149 0.4694
## F1 0.7080 0.5652 0.4694
## Prevalence 0.3432 0.3333 0.3234
## Detection Rate 0.2640 0.1716 0.1518
## Detection Prevalence 0.4026 0.2739 0.3234
## Balanced Accuracy 0.7791 0.6807 0.6079
# Evaluate the random forest model on the held-out test split
validateAndPrintResult(modelList$rf, test.data)
## Random Forest
##
## 2814 samples
## 19 predictor
## 3 classes: 'H', 'L', 'M'
##
## No pre-processing
## Resampling: Cross-Validated (7 fold)
## Summary of sample sizes: 2412, 2412, 2412, 2411, 2413, 2412, ...
## Resampling results:
##
## logLoss AUC prAUC Accuracy Kappa Mean_F1
## 0.7764229 0.8052583 0.6443655 0.637183 0.4548583 0.6261077
## Mean_Sensitivity Mean_Specificity Mean_Pos_Pred_Value Mean_Neg_Pred_Value
## 0.6327145 0.8189463 0.6273004 0.8232955
## Mean_Precision Mean_Recall Mean_Detection_Rate Mean_Balanced_Accuracy
## 0.6273004 0.6327145 0.2123943 0.7258304
##
## Tuning parameter 'mtry' was held constant at a value of 10
## ***MLeval: Machine Learning Model Evaluation***
## Input: caret train function object
## Not averaging probs.
## Group 1 type: cv
## Observations: 2814
## Number of groups: 1
## Observations per group: 2814
## Positive: L
## Negative: H
## Group: Group 1
## Positive: 947
## Negative: 966
## ***Performance Metrics***



## Group 1 Optimal Informedness = 0.566664724789867
## Group 1 AUC-ROC = 0.85

## Confusion Matrix and Statistics
##
## Reference
## Prediction H L M
## H 297 17 79
## L 31 316 158
## M 85 72 148
##
## Overall Statistics
##
## Accuracy : 0.6326
## 95% CI : (0.6046, 0.6599)
## No Information Rate : 0.3433
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4478
##
## Mcnemar's Test P-Value : 5.988e-08
##
## Statistics by Class:
##
## Class: H Class: L Class: M
## Sensitivity 0.7191 0.7802 0.3844
## Specificity 0.8785 0.7632 0.8081
## Pos Pred Value 0.7557 0.6257 0.4852
## Neg Pred Value 0.8568 0.8725 0.7361
## Precision 0.7557 0.6257 0.4852
## Recall 0.7191 0.7802 0.3844
## F1 0.7370 0.6945 0.4290
## Prevalence 0.3433 0.3367 0.3200
## Detection Rate 0.2469 0.2627 0.1230
## Detection Prevalence 0.3267 0.4198 0.2535
## Balanced Accuracy 0.7988 0.7717 0.5962
# Evaluate the random forest model on the 2019 validation set
validateAndPrintResult(modelList$rf, valid.cl.data)
## Random Forest
##
## 2814 samples
## 19 predictor
## 3 classes: 'H', 'L', 'M'
##
## No pre-processing
## Resampling: Cross-Validated (7 fold)
## Summary of sample sizes: 2412, 2412, 2412, 2411, 2413, 2412, ...
## Resampling results:
##
## logLoss AUC prAUC Accuracy Kappa Mean_F1
## 0.7764229 0.8052583 0.6443655 0.637183 0.4548583 0.6261077
## Mean_Sensitivity Mean_Specificity Mean_Pos_Pred_Value Mean_Neg_Pred_Value
## 0.6327145 0.8189463 0.6273004 0.8232955
## Mean_Precision Mean_Recall Mean_Detection_Rate Mean_Balanced_Accuracy
## 0.6273004 0.6327145 0.2123943 0.7258304
##
## Tuning parameter 'mtry' was held constant at a value of 10
## ***MLeval: Machine Learning Model Evaluation***
## Input: caret train function object
## Not averaging probs.
## Group 1 type: cv
## Observations: 2814
## Number of groups: 1
## Observations per group: 2814
## Positive: L
## Negative: H
## Group: Group 1
## Positive: 947
## Negative: 966
## ***Performance Metrics***



## Group 1 Optimal Informedness = 0.566664724789867
## Group 1 AUC-ROC = 0.85

## Confusion Matrix and Statistics
##
## Reference
## Prediction H L M
## H 88 25 21
## L 0 56 33
## M 16 20 44
##
## Overall Statistics
##
## Accuracy : 0.6205
## 95% CI : (0.5632, 0.6753)
## No Information Rate : 0.3432
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4292
##
## Mcnemar's Test P-Value : 2.391e-06
##
## Statistics by Class:
##
## Class: H Class: L Class: M
## Sensitivity 0.8462 0.5545 0.4490
## Specificity 0.7688 0.8366 0.8244
## Pos Pred Value 0.6567 0.6292 0.5500
## Neg Pred Value 0.9053 0.7897 0.7578
## Precision 0.6567 0.6292 0.5500
## Recall 0.8462 0.5545 0.4490
## F1 0.7395 0.5895 0.4944
## Prevalence 0.3432 0.3333 0.3234
## Detection Rate 0.2904 0.1848 0.1452
## Detection Prevalence 0.4422 0.2937 0.2640
## Balanced Accuracy 0.8075 0.6955 0.6367
# Evaluate the gradient boosting (gbm) model on the held-out test split
validateAndPrintResult(modelList$gbm, test.data)
## Stochastic Gradient Boosting
##
## 2814 samples
## 19 predictor
## 3 classes: 'H', 'L', 'M'
##
## No pre-processing
## Resampling: Cross-Validated (7 fold)
## Summary of sample sizes: 2412, 2412, 2412, 2411, 2413, 2412, ...
## Resampling results across tuning parameters:
##
## shrinkage interaction.depth n.minobsinnode logLoss AUC prAUC
## 0.01 1 5 0.7970860 0.8017350 0.6379452
## 0.01 1 10 0.7967029 0.8019479 0.6363702
## 0.01 1 15 0.7969933 0.8014128 0.6363660
## 0.01 3 5 0.7778815 0.8084940 0.6454769
## 0.01 3 10 0.7776951 0.8091191 0.6466037
## 0.01 3 15 0.7775230 0.8090222 0.6469494
## 0.01 5 5 0.7727954 0.8105842 0.6476454
## 0.01 5 10 0.7726221 0.8103953 0.6485085
## 0.01 5 15 0.7724906 0.8112027 0.6487959
## 0.10 1 5 0.7831853 0.8038137 0.6403604
## 0.10 1 10 0.7859896 0.8024619 0.6376919
## 0.10 1 15 0.7820925 0.8050564 0.6400199
## 0.10 3 5 0.8173556 0.8012995 0.6469857
## 0.10 3 10 0.8193106 0.7987970 0.6393842
## 0.10 3 15 0.8158653 0.8020714 0.6413022
## 0.10 5 5 0.8718596 0.7963071 0.6367853
## 0.10 5 10 0.8622812 0.7993938 0.6429804
## 0.10 5 15 0.8605530 0.7967428 0.6380514
## 0.30 1 5 0.8141373 0.7934245 0.6306437
## 0.30 1 10 0.8093852 0.7971067 0.6354846
## 0.30 1 15 0.8137835 0.7955163 0.6318490
## 0.30 3 5 1.0328940 0.7802969 0.6205405
## 0.30 3 10 1.0414531 0.7770668 0.6178356
## 0.30 3 15 1.0196153 0.7769416 0.6153279
## 0.30 5 5 1.2078892 0.7809344 0.6235276
## 0.30 5 10 1.2320716 0.7711913 0.6087259
## 0.30 5 15 1.2184465 0.7680801 0.6028812
## Accuracy Kappa Mean_F1 Mean_Sensitivity Mean_Specificity
## 0.6315139 0.4451687 0.6056652 0.6252168 0.8155071
## 0.6268898 0.4381002 0.5990852 0.6204141 0.8131440
## 0.6297327 0.4424632 0.6035038 0.6233753 0.8146086
## 0.6450073 0.4662700 0.6299346 0.6399955 0.8226752
## 0.6418090 0.4614145 0.6259860 0.6367284 0.8210489
## 0.6364758 0.4534161 0.6205193 0.6313559 0.8184055
## 0.6485609 0.4717146 0.6349474 0.6437288 0.8244979
## 0.6425126 0.4625664 0.6279106 0.6375671 0.8214506
## 0.6407349 0.4599161 0.6261444 0.6358584 0.8205658
## 0.6343348 0.4504137 0.6214377 0.6295675 0.8174407
## 0.6304310 0.4444873 0.6170718 0.6255908 0.8154557
## 0.6307855 0.4449879 0.6169925 0.6258732 0.8156174
## 0.6204834 0.4298616 0.6116397 0.6161000 0.8106598
## 0.6197691 0.4286818 0.6089492 0.6152697 0.8102324
## 0.6176378 0.4255104 0.6073076 0.6131235 0.8091903
## 0.6069751 0.4097143 0.5993592 0.6027814 0.8039770
## 0.6208370 0.4305453 0.6137082 0.6168690 0.8108651
## 0.6105190 0.4150676 0.6029256 0.6063820 0.8057571
## 0.6133734 0.4189726 0.6014018 0.6086365 0.8069941
## 0.6208308 0.4302212 0.6095685 0.6161740 0.8107442
## 0.6176370 0.4253446 0.6054912 0.6128929 0.8091082
## 0.5927498 0.3882413 0.5844321 0.5885879 0.7967642
## 0.5952542 0.3922162 0.5885796 0.5912674 0.7981450
## 0.6005803 0.4002080 0.5939768 0.5966115 0.8008033
## 0.6083877 0.4118537 0.6009379 0.6044192 0.8046471
## 0.5838745 0.3754205 0.5794714 0.5801981 0.7926145
## 0.5835271 0.3746698 0.5773306 0.5794632 0.7923528
## Mean_Pos_Pred_Value Mean_Neg_Pred_Value Mean_Precision Mean_Recall
## 0.6103709 0.8267023 0.6103709 0.6252168
## 0.6037873 0.8250766 0.6037873 0.6204141
## 0.6079959 0.8259423 0.6079959 0.6233753
## 0.6321172 0.8291784 0.6321172 0.6399955
## 0.6285924 0.8279356 0.6285924 0.6367284
## 0.6216607 0.8251693 0.6216607 0.6313559
## 0.6374184 0.8303460 0.6374184 0.6437288
## 0.6295434 0.8276807 0.6295434 0.6375671
## 0.6278351 0.8267441 0.6278351 0.6358584
## 0.6222568 0.8226338 0.6222568 0.6295675
## 0.6176608 0.8208076 0.6176608 0.6255908
## 0.6173933 0.8211245 0.6173933 0.6258732
## 0.6106109 0.8133363 0.6106109 0.6161000
## 0.6074786 0.8139173 0.6074786 0.6152697
## 0.6060231 0.8126394 0.6060231 0.6131235
## 0.5980222 0.8059655 0.5980222 0.6027814
## 0.6126897 0.8127827 0.6126897 0.6168690
## 0.6014219 0.8077581 0.6014219 0.6063820
## 0.5997755 0.8112085 0.5997755 0.6086365
## 0.6086933 0.8146523 0.6086933 0.6161740
## 0.6046140 0.8135697 0.6046140 0.6128929
## 0.5822637 0.7991150 0.5822637 0.5885879
## 0.5873584 0.7996838 0.5873584 0.5912674
## 0.5926529 0.8023183 0.5926529 0.5966115
## 0.5992886 0.8066503 0.5992886 0.6044192
## 0.5792217 0.7930418 0.5792217 0.5801981
## 0.5767630 0.7935956 0.5767630 0.5794632
## Mean_Detection_Rate Mean_Balanced_Accuracy
## 0.2105046 0.7203620
## 0.2089633 0.7167790
## 0.2099109 0.7189920
## 0.2150024 0.7313353
## 0.2139363 0.7288886
## 0.2121586 0.7248807
## 0.2161870 0.7341133
## 0.2141709 0.7295088
## 0.2135783 0.7282121
## 0.2114449 0.7235041
## 0.2101437 0.7205232
## 0.2102618 0.7207453
## 0.2068278 0.7133799
## 0.2065897 0.7127511
## 0.2058793 0.7111569
## 0.2023250 0.7033792
## 0.2069457 0.7138670
## 0.2035063 0.7060696
## 0.2044578 0.7078153
## 0.2069436 0.7134591
## 0.2058790 0.7110005
## 0.1975833 0.6926761
## 0.1984181 0.6947062
## 0.2001934 0.6987074
## 0.2027959 0.7045331
## 0.1946248 0.6864063
## 0.1945090 0.6859080
##
## Tuning parameter 'n.trees' was held constant at a value of 400
## logLoss was used to select the optimal model using the smallest value.
## The final values used for the model were n.trees = 400, interaction.depth =
## 5, shrinkage = 0.01 and n.minobsinnode = 15.
## ***MLeval: Machine Learning Model Evaluation***
## Input: caret train function object
## Not averaging probs.
## Group 1 type: cv
## Observations: 2814
## Number of groups: 1
## Observations per group: 2814
## Positive: L
## Negative: H
## Group: Group 1
## Positive: 947
## Negative: 966
## ***Performance Metrics***



## Group 1 Optimal Informedness = 0.583696492574584
## Group 1 AUC-ROC = 0.85

## Confusion Matrix and Statistics
##
## Reference
## Prediction H L M
## H 303 13 69
## L 30 321 162
## M 80 71 154
##
## Overall Statistics
##
## Accuracy : 0.6467
## 95% CI : (0.619, 0.6738)
## No Information Rate : 0.3433
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4691
##
## Mcnemar's Test P-Value : 2.374e-09
##
## Statistics by Class:
##
## Class: H Class: L Class: M
## Sensitivity 0.7337 0.7926 0.4000
## Specificity 0.8962 0.7594 0.8154
## Pos Pred Value 0.7870 0.6257 0.5049
## Neg Pred Value 0.8655 0.8783 0.7428
## Precision 0.7870 0.6257 0.5049
## Recall 0.7337 0.7926 0.4000
## F1 0.7594 0.6993 0.4464
## Prevalence 0.3433 0.3367 0.3200
## Detection Rate 0.2519 0.2668 0.1280
## Detection Prevalence 0.3200 0.4264 0.2535
## Balanced Accuracy 0.8149 0.7760 0.6077
# Evaluate the gradient boosting (gbm) model on the 2019 validation set
validateAndPrintResult(modelList$gbm, valid.cl.data)
## Stochastic Gradient Boosting
##
## 2814 samples
## 19 predictor
## 3 classes: 'H', 'L', 'M'
##
## No pre-processing
## Resampling: Cross-Validated (7 fold)
## Summary of sample sizes: 2412, 2412, 2412, 2411, 2413, 2412, ...
## Resampling results across tuning parameters:
##
## shrinkage interaction.depth n.minobsinnode logLoss AUC prAUC
## 0.01 1 5 0.7970860 0.8017350 0.6379452
## 0.01 1 10 0.7967029 0.8019479 0.6363702
## 0.01 1 15 0.7969933 0.8014128 0.6363660
## 0.01 3 5 0.7778815 0.8084940 0.6454769
## 0.01 3 10 0.7776951 0.8091191 0.6466037
## 0.01 3 15 0.7775230 0.8090222 0.6469494
## 0.01 5 5 0.7727954 0.8105842 0.6476454
## 0.01 5 10 0.7726221 0.8103953 0.6485085
## 0.01 5 15 0.7724906 0.8112027 0.6487959
## 0.10 1 5 0.7831853 0.8038137 0.6403604
## 0.10 1 10 0.7859896 0.8024619 0.6376919
## 0.10 1 15 0.7820925 0.8050564 0.6400199
## 0.10 3 5 0.8173556 0.8012995 0.6469857
## 0.10 3 10 0.8193106 0.7987970 0.6393842
## 0.10 3 15 0.8158653 0.8020714 0.6413022
## 0.10 5 5 0.8718596 0.7963071 0.6367853
## 0.10 5 10 0.8622812 0.7993938 0.6429804
## 0.10 5 15 0.8605530 0.7967428 0.6380514
## 0.30 1 5 0.8141373 0.7934245 0.6306437
## 0.30 1 10 0.8093852 0.7971067 0.6354846
## 0.30 1 15 0.8137835 0.7955163 0.6318490
## 0.30 3 5 1.0328940 0.7802969 0.6205405
## 0.30 3 10 1.0414531 0.7770668 0.6178356
## 0.30 3 15 1.0196153 0.7769416 0.6153279
## 0.30 5 5 1.2078892 0.7809344 0.6235276
## 0.30 5 10 1.2320716 0.7711913 0.6087259
## 0.30 5 15 1.2184465 0.7680801 0.6028812
## Accuracy Kappa Mean_F1 Mean_Sensitivity Mean_Specificity
## 0.6315139 0.4451687 0.6056652 0.6252168 0.8155071
## 0.6268898 0.4381002 0.5990852 0.6204141 0.8131440
## 0.6297327 0.4424632 0.6035038 0.6233753 0.8146086
## 0.6450073 0.4662700 0.6299346 0.6399955 0.8226752
## 0.6418090 0.4614145 0.6259860 0.6367284 0.8210489
## 0.6364758 0.4534161 0.6205193 0.6313559 0.8184055
## 0.6485609 0.4717146 0.6349474 0.6437288 0.8244979
## 0.6425126 0.4625664 0.6279106 0.6375671 0.8214506
## 0.6407349 0.4599161 0.6261444 0.6358584 0.8205658
## 0.6343348 0.4504137 0.6214377 0.6295675 0.8174407
## 0.6304310 0.4444873 0.6170718 0.6255908 0.8154557
## 0.6307855 0.4449879 0.6169925 0.6258732 0.8156174
## 0.6204834 0.4298616 0.6116397 0.6161000 0.8106598
## 0.6197691 0.4286818 0.6089492 0.6152697 0.8102324
## 0.6176378 0.4255104 0.6073076 0.6131235 0.8091903
## 0.6069751 0.4097143 0.5993592 0.6027814 0.8039770
## 0.6208370 0.4305453 0.6137082 0.6168690 0.8108651
## 0.6105190 0.4150676 0.6029256 0.6063820 0.8057571
## 0.6133734 0.4189726 0.6014018 0.6086365 0.8069941
## 0.6208308 0.4302212 0.6095685 0.6161740 0.8107442
## 0.6176370 0.4253446 0.6054912 0.6128929 0.8091082
## 0.5927498 0.3882413 0.5844321 0.5885879 0.7967642
## 0.5952542 0.3922162 0.5885796 0.5912674 0.7981450
## 0.6005803 0.4002080 0.5939768 0.5966115 0.8008033
## 0.6083877 0.4118537 0.6009379 0.6044192 0.8046471
## 0.5838745 0.3754205 0.5794714 0.5801981 0.7926145
## 0.5835271 0.3746698 0.5773306 0.5794632 0.7923528
## Mean_Pos_Pred_Value Mean_Neg_Pred_Value Mean_Precision Mean_Recall
## 0.6103709 0.8267023 0.6103709 0.6252168
## 0.6037873 0.8250766 0.6037873 0.6204141
## 0.6079959 0.8259423 0.6079959 0.6233753
## 0.6321172 0.8291784 0.6321172 0.6399955
## 0.6285924 0.8279356 0.6285924 0.6367284
## 0.6216607 0.8251693 0.6216607 0.6313559
## 0.6374184 0.8303460 0.6374184 0.6437288
## 0.6295434 0.8276807 0.6295434 0.6375671
## 0.6278351 0.8267441 0.6278351 0.6358584
## 0.6222568 0.8226338 0.6222568 0.6295675
## 0.6176608 0.8208076 0.6176608 0.6255908
## 0.6173933 0.8211245 0.6173933 0.6258732
## 0.6106109 0.8133363 0.6106109 0.6161000
## 0.6074786 0.8139173 0.6074786 0.6152697
## 0.6060231 0.8126394 0.6060231 0.6131235
## 0.5980222 0.8059655 0.5980222 0.6027814
## 0.6126897 0.8127827 0.6126897 0.6168690
## 0.6014219 0.8077581 0.6014219 0.6063820
## 0.5997755 0.8112085 0.5997755 0.6086365
## 0.6086933 0.8146523 0.6086933 0.6161740
## 0.6046140 0.8135697 0.6046140 0.6128929
## 0.5822637 0.7991150 0.5822637 0.5885879
## 0.5873584 0.7996838 0.5873584 0.5912674
## 0.5926529 0.8023183 0.5926529 0.5966115
## 0.5992886 0.8066503 0.5992886 0.6044192
## 0.5792217 0.7930418 0.5792217 0.5801981
## 0.5767630 0.7935956 0.5767630 0.5794632
## Mean_Detection_Rate Mean_Balanced_Accuracy
## 0.2105046 0.7203620
## 0.2089633 0.7167790
## 0.2099109 0.7189920
## 0.2150024 0.7313353
## 0.2139363 0.7288886
## 0.2121586 0.7248807
## 0.2161870 0.7341133
## 0.2141709 0.7295088
## 0.2135783 0.7282121
## 0.2114449 0.7235041
## 0.2101437 0.7205232
## 0.2102618 0.7207453
## 0.2068278 0.7133799
## 0.2065897 0.7127511
## 0.2058793 0.7111569
## 0.2023250 0.7033792
## 0.2069457 0.7138670
## 0.2035063 0.7060696
## 0.2044578 0.7078153
## 0.2069436 0.7134591
## 0.2058790 0.7110005
## 0.1975833 0.6926761
## 0.1984181 0.6947062
## 0.2001934 0.6987074
## 0.2027959 0.7045331
## 0.1946248 0.6864063
## 0.1945090 0.6859080
##
## Tuning parameter 'n.trees' was held constant at a value of 400
## logLoss was used to select the optimal model using the smallest value.
## The final values used for the model were n.trees = 400, interaction.depth =
## 5, shrinkage = 0.01 and n.minobsinnode = 15.
## ***MLeval: Machine Learning Model Evaluation***
## Input: caret train function object
## Not averaging probs.
## Group 1 type: cv
## Observations: 2814
## Number of groups: 1
## Observations per group: 2814
## Positive: L
## Negative: H
## Group: Group 1
## Positive: 947
## Negative: 966
## ***Performance Metrics***



## Group 1 Optimal Informedness = 0.583696492574584
## Group 1 AUC-ROC = 0.85

## Confusion Matrix and Statistics
##
## Reference
## Prediction H L M
## H 97 24 26
## L 0 64 30
## M 7 13 42
##
## Overall Statistics
##
## Accuracy : 0.67
## 95% CI : (0.6139, 0.7227)
## No Information Rate : 0.3432
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5029
##
## Mcnemar's Test P-Value : 4.736e-09
##
## Statistics by Class:
##
## Class: H Class: L Class: M
## Sensitivity 0.9327 0.6337 0.4286
## Specificity 0.7487 0.8515 0.9024
## Pos Pred Value 0.6599 0.6809 0.6774
## Neg Pred Value 0.9551 0.8230 0.7676
## Precision 0.6599 0.6809 0.6774
## Recall 0.9327 0.6337 0.4286
## F1 0.7729 0.6564 0.5250
## Prevalence 0.3432 0.3333 0.3234
## Detection Rate 0.3201 0.2112 0.1386
## Detection Prevalence 0.4851 0.3102 0.2046
## Balanced Accuracy 0.8407 0.7426 0.6655
# Work on a copy of the classification data for the H2O workflow
h2o.data <- class.data
# Create the training and test datasets
set.seed(100)  # fixed seed so the partition is reproducible
# The response must be a factor for classification
h2o.data$RISK <- as.factor(h2o.data$RISK)
# Step 1: Get row numbers for the training data (75% split, stratified on RISK)
trainRowNumbers.cl <-
createDataPartition(h2o.data$RISK, p = 0.75, list = FALSE)
# Step 2: Create the training dataset
train.data <- h2o.data[trainRowNumbers.cl, ]
# Step 3: Create the test dataset (rows not selected for training)
test.data <- h2o.data[-trainRowNumbers.cl, ]
# Upload the training set to the H2O cluster (returns an H2OFrame)
train.data <- as.h2o(train.data)
##
|
| | 0%
|
|======================================================================| 100%
# Upload the test set to the H2O cluster as well
test.data <- as.h2o(test.data)
##
|
| | 0%
|
|======================================================================| 100%
# Response column, and predictors = every other column in the data
y <- "RISK"
x <- setdiff(names(h2o.data), y)
# H2O treats the task as classification only when the response is a factor,
# so coerce it on both the training and the test frame
train.data[, y] <- as.factor(train.data[, y])
test.data[, y] <- as.factor(test.data[, y])
# Cross-validation fold count (generates level-one data for stacking)
nfolds <- 5
# # 2. Generate a random grid of models and stack them together
#
# # Some XGboost/GBM /rf hyperparameters
# hyper_params <- list(
# ntrees = seq(10, 1000, 1),
# learn_rate = seq(0.0001, 0.2, 0.0001),
# max_depth = seq(1, 20, 1),
# sample_rate = seq(0.5, 1.0, 0.0001),
# col_sample_rate = seq(0.2, 1.0, 0.0001)
# )
#
# search_criteria <- list(strategy = "RandomDiscrete",
# max_models = 10)
#
# grid.id <- as.character(format(Sys.time(), "%S"))
#
#
# # Train & Cross-validate a RF
# rf_grid <- h2o.grid(
# algorithm = "drf",
# grid_id = paste0("grid_binomial_rf_", grid.id),
# x = x,
# y = y,
# training_frame = train.data,
# seed = 100,
# nfolds = nfolds,
# ntrees = 2500,
# fold_assignment = "Modulo",
# keep_cross_validation_predictions = TRUE
# )
#
#
# gbm_grid <- h2o.grid(
# algorithm = "gbm",
# grid_id = paste0("grid_binomial_gbm_", grid.id),
# x = x,
# y = y,
# training_frame = train.data,
# # ntrees = seq(10, 1000, 1),
# seed = 100,
# nfolds = nfolds,
# fold_assignment = "Modulo",
# keep_cross_validation_predictions = TRUE,
# hyper_params = hyper_params,
# search_criteria = search_criteria
# )
#
#
#
# # Train the grid
# xgb_grid <- h2o.grid(
# algorithm = "xgboost",
# grid_id = paste0("grid_binomial_xgb_", grid.id),
# x = x,
# y = y,
# training_frame = train.data,
# nfolds = nfolds,
# seed = 100,
# fold_assignment = "Modulo",
# keep_cross_validation_predictions = TRUE,
# hyper_params = hyper_params,
# search_criteria = search_criteria
# )
#
# # Train a stacked ensemble using the H2O and XGBoost models from above
# base.models <- append(gbm_grid@model_ids,
# xgb_grid@model_ids)
#
# # Train a stacked ensemble using the GBM grid
# ensemble <- h2o.stackedEnsemble(
# x = x,
# y = y,
# model_id = paste0("ensemble_gbm_grid_", grid.id, "_24"),
# training_frame = train.data,
# base_models = base.models
# )
#
# # Eval ensemble performance on a test set
# perf <- h2o.performance(ensemble, newdata = test.data)
#
# # Compare to base learner performance on the test set
# .getmean_per_class_error <-
# function(mm)
# h2o.mean_per_class_error(h2o.performance(h2o.getModel(mm), newdata = test.data))
#
# baselearner_aucs <- sapply(base.models, .getmean_per_class_error)
# baselearner_best_auc_test <- max(baselearner_aucs)
# ensemble_auc_test <- h2o.mean_per_class_error(perf)
# print(sprintf("Best Base-learner Test Mean per class error: %s", baselearner_best_auc_test))
# print(sprintf("Ensemble Test Mean per class error: %s", ensemble_auc_test))
#
# # Generate predictions on a test set (if neccessary)
# pred <- h2o.predict(ensemble, newdata = test.data)
#
# # Sort the grid by CV AUC for GBM
# get_gbm_grid <- h2o.getGrid(grid_id = gbm_grid@grid_id, sort_by = "mean_per_class_error", decreasing = TRUE)
# get_gbm_grid
# gbm_grid_top_model <- get_gbm_grid@summary_table[1, "model_ids"]
# gbm_grid_top_model
#
# # Sort the grid by CV AUC for XGBOOST
# get_xgb_grid <- h2o.getGrid(grid_id = xgb_grid@grid_id, sort_by = "mean_per_class_error", decreasing = TRUE)
# get_xgb_grid
# xgb_grid_top_model <- get_xgb_grid@summary_table[1, "model_ids"]
# xgb_grid_top_model
#
# # Sort the grid by CV AUC for XGBOOST
# get_rf_grid <- h2o.getGrid(grid_id = rf_grid@grid_id, sort_by = "mean_per_class_error", decreasing = TRUE)
# get_rf_grid
# rf_grid_top_model <- get_rf_grid@summary_table[1, "model_ids"]
# rf_grid_top_model
# Use AutoML to find a list of candidate models (i.e., leaderboard),
# ranked by mean per-class error on cross-validation.
auto_ml <- h2o.automl(
  x = x,
  y = y,
  training_frame = train.data,
  # Reuse the fold count defined earlier instead of a duplicated literal 5,
  # so changing nfolds in one place keeps the whole script consistent.
  nfolds = nfolds,
  max_runtime_secs = 60 * 120,  # hard cap: 2 hours of search time
  max_models = 10,
  keep_cross_validation_predictions = FALSE,
  sort_metric = "mean_per_class_error",
  seed = 123,
  stopping_rounds = 50,
  stopping_metric = "mean_per_class_error",
  # NOTE(review): a tolerance of 0 triggers H2O's warning (seen in the run
  # log below) that models may be slow to converge or never converge;
  # consider H2O's recommended default (~0.018 for this data size).
  stopping_tolerance = 0
)
##
|
| | 0%
## 23:28:58.300: Stopping tolerance set by the user is < 70% of the recommended default of 0.018214966464911487, so models may take a long time to converge or may not converge at all.
|
| | 1%
|
|= | 1%
|
|== | 2%
|
|== | 3%
|
|=== | 4%
|
|=== | 5%
|
|==== | 5%
|
|==== | 6%
|
|===== | 8%
|
|====== | 8%
|
|====== | 9%
|
|======== | 11%
|
|========= | 12%
|
|========= | 13%
|
|========== | 14%
|
|========== | 15%
|
|=========== | 15%
|
|=========== | 16%
|
|============ | 17%
|
|============ | 18%
|
|============= | 19%
|
|============== | 19%
|
|============== | 20%
|
|============== | 21%
|
|=============== | 22%
|
|================ | 22%
|
|================ | 23%
|
|================= | 24%
|
|================= | 25%
|
|================== | 25%
|
|================== | 26%
|
|=================== | 27%
|
|=================== | 28%
|
|===================== | 31%
|
|======================= | 33%
|
|======================================================================| 100%
## model_id mean_per_class_error
## 1 XGBoost_3_AutoML_20200226_232858 0.3621127
## 2 StackedEnsemble_BestOfFamily_AutoML_20200226_232858 0.3708717
## 3 DRF_1_AutoML_20200226_232858 0.3725618
## 4 StackedEnsemble_AllModels_AutoML_20200226_232858 0.3736785
## 5 XGBoost_1_AutoML_20200226_232858 0.3788496
## 6 XGBoost_2_AutoML_20200226_232858 0.3789230
## logloss rmse mse
## 1 0.7721607 0.5299974 0.2808972
## 2 0.7720142 0.5239900 0.2745655
## 3 0.8395249 0.5176616 0.2679736
## 4 0.7715062 0.5236370 0.2741957
## 5 0.7714698 0.5284186 0.2792263
## 6 0.7735021 0.5283216 0.2791237
##
## [12 rows x 5 columns]
# Assess the leader board; the following truncates the results to show the top
# and bottom 15 models. You can get the top model with auto_ml@leader
# Show the model id and mean per-class error for the top 25 leaderboard
# entries (fewer if the leaderboard is shorter).
auto_ml@leaderboard %>%
  as.data.frame() %>%
  dplyr::select(model_id, mean_per_class_error) %>%
  head(25)
## model_id mean_per_class_error
## 1 XGBoost_3_AutoML_20200226_232858 0.3621127
## 2 StackedEnsemble_BestOfFamily_AutoML_20200226_232858 0.3708717
## 3 DRF_1_AutoML_20200226_232858 0.3725618
## 4 StackedEnsemble_AllModels_AutoML_20200226_232858 0.3736785
## 5 XGBoost_1_AutoML_20200226_232858 0.3788496
## 6 XGBoost_2_AutoML_20200226_232858 0.3789230
## 7 GBM_1_AutoML_20200226_232858 0.3903032
## 8 GBM_2_AutoML_20200226_232858 0.4012999
## 9 GBM_5_AutoML_20200226_232858 0.4040333
## 10 GBM_4_AutoML_20200226_232858 0.4053515
## 11 GBM_3_AutoML_20200226_232858 0.4058087
## 12 GLM_1_AutoML_20200226_232858 0.4239250